import pandas as pd
import numpy as np
import seaborn as sb
import plotly.express as px
import matplotlib.pyplot as plot
from collections import Counter
# Most frequent value and its frequency
def most_common_Vales(value):
    """Return the most frequent element of ``value`` and how often it occurs.

    Args:
        value: Iterable of hashable items (anything ``Counter`` accepts).

    Returns:
        A ``(element, count)`` tuple: the first entry of
        ``Counter(value).most_common(1)``.

    Raises:
        IndexError: If ``value`` is empty.
    """
    ranked = Counter(value).most_common(1)
    top_item, occurrences = ranked[0]
    return top_item, occurrences
# Maximum value and number of times it occurs
def max_count_value(value):
    """Return the maximum of ``value`` and how many elements equal it.

    Args:
        value: Array-like exposing ``.max()`` and element-wise ``==``
            (e.g. a NumPy array or pandas Series).

    Returns:
        A ``(maximum, occurrences)`` tuple.
    """
    largest = value.max()
    is_largest = value == largest
    return largest, np.count_nonzero(is_largest)
# Minimum value and number of times it occurs
def min_count_value(value):
    """Return the minimum of ``value`` and how many elements equal it.

    Args:
        value: Array-like exposing ``.min()`` and element-wise ``==``
            (e.g. a NumPy array or pandas Series).

    Returns:
        A ``(minimum, occurrences)`` tuple.
    """
    smallest = value.min()
    is_smallest = value == smallest
    return smallest, np.count_nonzero(is_smallest)
#describe df
def describe(col):
if col.dtype !="object":
des=(pd.DataFrame(col).describe().style.background_gradient(cmap='viridis', axis=1))
else:
des=(pd.DataFrame(col).describe(include="object").style.background_gradient(cmap='viridis', axis=1))
return des
# Skew direction from mean vs. median
def Data_trend(value):
    """Label the skew direction of ``value`` by comparing mean and median.

    Args:
        value: Object exposing ``.mean()`` and ``.median()``
            (e.g. a pandas Series).

    Returns:
        A label ending in ``Right`` when the mean exceeds the median,
        ``Left`` when the mean is below it, and ``symmetric`` otherwise.
    """
    average = value.mean()
    middle = value.median()
    if average > middle:
        return "Data Trend equel : Right"
    if average < middle:
        return "Data Trend equel : Left"
    return "Data Trend equel : symmetric"
def summary(value):
    """Print the maximum, minimum and most frequent value of ``value``.

    Each figure is printed together with its number of occurrences; the
    first two reports are each followed by a line of 100 asterisks.

    Args:
        value: Data accepted by ``max_count_value``, ``min_count_value``
            and ``most_common_Vales``.
    """
    divider = '*' * 100

    # Largest value and how often it appears
    largest, largest_hits = max_count_value(value)
    print(f"The maximum value equals : {largest} --- The number of Value {largest} is equal {largest_hits}")
    print(divider)

    # Smallest value and how often it appears
    smallest, smallest_hits = min_count_value(value)
    print(f"The minimum value equals : {smallest} --- The number of value {smallest} is equal {smallest_hits}")
    print(divider)

    # Most frequent value and its frequency
    frequent, frequency = most_common_Vales(value)
    print(f"The maximum duplication value equals : {frequent} --- The number of duplications is greater than the duplication value is equal to : {frequency}")
def create_pairplot(df, numerical_cols):
    """Draw and display a seaborn pairplot of the selected columns.

    Args:
        df: pandas DataFrame holding the data.
        numerical_cols: Column labels of ``df`` to include in the plot.
    """
    # seaborn is imported as ``sb`` in this file; the previous ``sns`` alias
    # was never defined, so calling this function raised NameError.
    g = sb.pairplot(df[numerical_cols], diag_kind="kde")  # KDE on the diagonal
    # Place the title slightly above the grid (y > 1)
    g.fig.suptitle('Pairplot of Numerical Features', y=1.02)
    plot.tight_layout()
    plot.show()
def create_boxplots(df):
    """Show vertical boxplots of the columns of ``df`` on one figure.

    Args:
        df: pandas DataFrame whose columns are drawn with ``df.plot.box``.
    """
    fig, ax = plot.subplots(figsize=(10, 6))
    df.plot.box(vert=True, ax=ax)
    ax.set_title("Boxplot for All Numerical Columns")
    # With vert=True there is one box per column along the x-axis and the
    # measured values run along the y-axis; the labels were previously swapped.
    ax.set_xlabel("Column Name")
    ax.set_ylabel("Value")
    plot.show()
# Load the raw loan-approval CSV; df_LAP keeps the data as read from disk
df_LAP = pd.read_csv('Loan approval prediction.csv')
# Work on a copy so later edits to df do not touch df_LAP
df=df_LAP.copy()
# Preview 25 rows; random_state fixes the seed so the same rows are drawn each run
df.sample(n=25, random_state=49)
| id | person_age | person_income | person_home_ownership | person_emp_length | loan_intent | loan_grade | loan_amnt | loan_int_rate | loan_percent_income | cb_person_default_on_file | cb_person_cred_hist_length | loan_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56796 | 56796 | 29 | 48000 | MORTGAGE | 5.0 | PERSONAL | A | 10000 | 7.51 | 0.21 | N | 10 | 0 |
| 40685 | 40685 | 35 | 74000 | RENT | 15.0 | EDUCATION | C | 15000 | 12.68 | 0.20 | N | 5 | 0 |
| 12572 | 12572 | 27 | 68000 | MORTGAGE | 11.0 | HOMEIMPROVEMENT | A | 10750 | 7.51 | 0.16 | N | 5 | 0 |
| 2585 | 2585 | 25 | 90000 | MORTGAGE | 4.0 | EDUCATION | C | 5500 | 12.73 | 0.06 | Y | 4 | 0 |
| 1032 | 1032 | 36 | 80000 | RENT | 6.0 | EDUCATION | A | 10000 | 6.17 | 0.13 | N | 15 | 0 |
| 12271 | 12271 | 43 | 80000 | MORTGAGE | 1.0 | VENTURE | B | 7800 | 11.49 | 0.09 | N | 11 | 0 |
| 53475 | 53475 | 36 | 55000 | MORTGAGE | 3.0 | PERSONAL | B | 4000 | 10.59 | 0.07 | N | 17 | 0 |
| 11869 | 11869 | 25 | 87000 | RENT | 0.0 | HOMEIMPROVEMENT | A | 8500 | 9.38 | 0.10 | N | 3 | 0 |
| 7035 | 7035 | 29 | 52000 | MORTGAGE | 3.0 | VENTURE | A | 5000 | 5.99 | 0.10 | N | 5 | 0 |
| 25745 | 25745 | 27 | 90000 | MORTGAGE | 4.0 | PERSONAL | A | 6000 | 7.51 | 0.07 | N | 8 | 0 |
| 24878 | 24878 | 23 | 105000 | MORTGAGE | 7.0 | DEBTCONSOLIDATION | A | 12400 | 5.42 | 0.12 | N | 3 | 0 |
| 2144 | 2144 | 25 | 90000 | MORTGAGE | 9.0 | HOMEIMPROVEMENT | A | 12000 | 6.92 | 0.13 | N | 4 | 0 |
| 20226 | 20226 | 28 | 24000 | RENT | 0.0 | MEDICAL | B | 3000 | 12.42 | 0.13 | N | 6 | 0 |
| 38225 | 38225 | 31 | 100995 | MORTGAGE | 6.0 | DEBTCONSOLIDATION | A | 16000 | 7.51 | 0.16 | N | 6 | 0 |
| 49636 | 49636 | 23 | 49000 | RENT | 6.0 | PERSONAL | C | 11600 | 15.27 | 0.24 | Y | 2 | 0 |
| 40038 | 40038 | 27 | 95000 | RENT | 1.0 | PERSONAL | C | 5600 | 14.65 | 0.05 | Y | 8 | 0 |
| 37747 | 37747 | 38 | 65000 | MORTGAGE | 1.0 | EDUCATION | B | 9500 | 10.00 | 0.15 | N | 14 | 0 |
| 44723 | 44723 | 27 | 63000 | RENT | 3.0 | MEDICAL | C | 7200 | 15.27 | 0.11 | N | 5 | 0 |
| 50655 | 50655 | 28 | 120000 | MORTGAGE | 0.0 | MEDICAL | B | 15000 | 10.99 | 0.13 | N | 7 | 0 |
| 53879 | 53879 | 25 | 30000 | RENT | 3.0 | VENTURE | A | 6400 | 7.49 | 0.22 | N | 4 | 0 |
| 22964 | 22964 | 32 | 69000 | MORTGAGE | 4.0 | DEBTCONSOLIDATION | B | 12000 | 10.99 | 0.17 | N | 7 | 1 |
| 37494 | 37494 | 22 | 40000 | RENT | 1.0 | DEBTCONSOLIDATION | B | 15000 | 10.65 | 0.38 | N | 2 | 1 |
| 57394 | 57394 | 25 | 60000 | RENT | 2.0 | EDUCATION | B | 15000 | 12.53 | 0.25 | N | 3 | 0 |
| 55071 | 55071 | 22 | 85000 | RENT | 6.0 | EDUCATION | B | 7500 | 10.08 | 0.09 | N | 3 | 0 |
| 42195 | 42195 | 29 | 36000 | RENT | 2.0 | EDUCATION | C | 5500 | 13.57 | 0.15 | N | 7 | 0 |
# Print column names, non-null counts, dtypes and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 58645 entries, 0 to 58644 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 58645 non-null int64 1 person_age 58645 non-null int64 2 person_income 58645 non-null int64 3 person_home_ownership 58645 non-null object 4 person_emp_length 58645 non-null float64 5 loan_intent 58645 non-null object 6 loan_grade 58645 non-null object 7 loan_amnt 58645 non-null int64 8 loan_int_rate 58645 non-null float64 9 loan_percent_income 58645 non-null float64 10 cb_person_default_on_file 58645 non-null object 11 cb_person_cred_hist_length 58645 non-null int64 12 loan_status 58645 non-null int64 dtypes: float64(3), int64(6), object(4) memory usage: 5.8+ MB
# Number of missing values in each column
df.isna().sum()
id 0 person_age 0 person_income 0 person_home_ownership 0 person_emp_length 0 loan_intent 0 loan_grade 0 loan_amnt 0 loan_int_rate 0 loan_percent_income 0 cb_person_default_on_file 0 cb_person_cred_hist_length 0 loan_status 0 dtype: int64
# Number of fully duplicated rows
print("duplicated = ",df.duplicated().sum())
duplicated = 0
# Drop the identifier column, then split the remaining columns by dtype
df = df.drop(columns="id")
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
object_cols = df.select_dtypes(include='object').columns
# Summary statistics of the numeric columns, colour-graded across each row
df.describe().style.background_gradient(cmap='viridis', axis=1)
| person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_percent_income | cb_person_cred_hist_length | loan_status | |
|---|---|---|---|---|---|---|---|---|
| count | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 |
| mean | 27.550857 | 64046.172871 | 4.701015 | 9217.556518 | 10.677874 | 0.159238 | 5.813556 | 0.142382 |
| std | 6.033216 | 37931.106979 | 3.959784 | 5563.807384 | 3.034697 | 0.091692 | 4.029196 | 0.349445 |
| min | 20.000000 | 4200.000000 | 0.000000 | 500.000000 | 5.420000 | 0.000000 | 2.000000 | 0.000000 |
| 25% | 23.000000 | 42000.000000 | 2.000000 | 5000.000000 | 7.880000 | 0.090000 | 3.000000 | 0.000000 |
| 50% | 26.000000 | 58000.000000 | 4.000000 | 8000.000000 | 10.750000 | 0.140000 | 4.000000 | 0.000000 |
| 75% | 30.000000 | 75600.000000 | 7.000000 | 12000.000000 | 12.990000 | 0.210000 | 8.000000 | 0.000000 |
| max | 123.000000 | 1900000.000000 | 123.000000 | 35000.000000 | 23.220000 | 0.830000 | 30.000000 | 1.000000 |
# Summary (count / unique / top / freq) of the object-dtype columns
df.describe(include='object')
| person_home_ownership | loan_intent | loan_grade | cb_person_default_on_file | |
|---|---|---|---|---|
| count | 58645 | 58645 | 58645 | 58645 |
| unique | 4 | 6 | 7 | 2 |
| top | RENT | EDUCATION | A | N |
| freq | 30594 | 12271 | 20984 | 49943 |
# Skew label for every non-object column
col = [name for name in df.columns if df[name].dtype != 'object']
trend = [Data_trend(df[name]) for name in col]
trend_ = pd.DataFrame({'Cols': col, 'DataTrend': trend})
trend_
| Cols | DataTrend | |
|---|---|---|
| 0 | person_age | Data Trend equel : Right |
| 1 | person_income | Data Trend equel : Right |
| 2 | person_emp_length | Data Trend equel : Right |
| 3 | loan_amnt | Data Trend equel : Right |
| 4 | loan_int_rate | Data Trend equel : Left |
| 5 | loan_percent_income | Data Trend equel : Right |
| 6 | cb_person_cred_hist_length | Data Trend equel : Right |
| 7 | loan_status | Data Trend equel : Right |
# Pairwise correlation of the numeric columns, colour-graded across each row
cor=df.corr(numeric_only=True).T.style.background_gradient(cmap='viridis', axis=1)
cor
| person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_percent_income | cb_person_cred_hist_length | loan_status | |
|---|---|---|---|---|---|---|---|---|
| person_age | 1.000000 | 0.102176 | 0.121276 | 0.050378 | 0.009653 | -0.031975 | 0.874260 | -0.001130 |
| person_income | 0.102176 | 1.000000 | 0.164042 | 0.310942 | -0.057611 | -0.280314 | 0.082727 | -0.169956 |
| person_emp_length | 0.121276 | 0.164042 | 1.000000 | 0.092046 | -0.101910 | -0.065824 | 0.102842 | -0.100428 |
| loan_amnt | 0.050378 | 0.310942 | 0.092046 | 1.000000 | 0.113582 | 0.647266 | 0.045720 | 0.144982 |
| loan_int_rate | 0.009653 | -0.057611 | -0.101910 | 0.113582 | 1.000000 | 0.152201 | 0.007535 | 0.338948 |
| loan_percent_income | -0.031975 | -0.280314 | -0.065824 | 0.647266 | 0.152201 | 1.000000 | -0.023202 | 0.378280 |
| cb_person_cred_hist_length | 0.874260 | 0.082727 | 0.102842 | 0.045720 | 0.007535 | -0.023202 | 1.000000 | -0.003030 |
| loan_status | -0.001130 | -0.169956 | -0.100428 | 0.144982 | 0.338948 | 0.378280 | -0.003030 | 1.000000 |
# Column-by-column analysis of the numeric features
for i in col:
    print("Analysis : ",i)
    data_trend=Data_trend(df[i])
    print(data_trend)
    # Title each figure after the column actually plotted; the titles were
    # previously hard-coded to 'Person income' for every column.
    pretty_name = i.replace('_', ' ').capitalize()
    fig=px.box(df[i],color_discrete_sequence=px.colors.qualitative.Dark24,
               template="seaborn",title=f'Box {pretty_name}')
    fig.show()
    fig=px.histogram(df[i],color_discrete_sequence=px.colors.qualitative.Dark24,
                     template="seaborn",title=f'Histogram {pretty_name}')
    fig.show()
    print("#"*100)
Analysis : person_age Data Trend equel : Right
#################################################################################################### Analysis : person_income Data Trend equel : Right
#################################################################################################### Analysis : person_emp_length Data Trend equel : Right
#################################################################################################### Analysis : loan_amnt Data Trend equel : Right
#################################################################################################### Analysis : loan_int_rate Data Trend equel : Left
#################################################################################################### Analysis : loan_percent_income Data Trend equel : Right
#################################################################################################### Analysis : cb_person_cred_hist_length Data Trend equel : Right
#################################################################################################### Analysis : loan_status Data Trend equel : Right
####################################################################################################
# Frequency of each home-ownership category, shown as a labelled bar chart
person_home_ownership_group = df['person_home_ownership'].value_counts()
fig = px.bar(
    x=person_home_ownership_group.index,
    y=person_home_ownership_group,
    text=person_home_ownership_group,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    template='seaborn',
    title='Distribution of person home ownership',
)
fig.show()
# Frequency of each loan intent, shown as a labelled bar chart
loan_intent_group = df['loan_intent'].value_counts()
fig = px.bar(
    x=loan_intent_group.index,
    y=loan_intent_group,
    text=loan_intent_group,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    template="seaborn",
    title='Distribution of Loan Intent',
)
fig.show()
# Frequency of each loan grade, shown as a labelled bar chart
loan_grade_group = df['loan_grade'].value_counts()
fig = px.bar(
    x=loan_grade_group.index,
    y=loan_grade_group,
    text=loan_grade_group,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    template="seaborn",
    title='Distribution of loan grade',
)
fig.show()
# Frequency of each default-on-file flag, shown as a labelled bar chart
cb_person_default_on_file_group = df['cb_person_default_on_file'].value_counts()
fig = px.bar(
    x=cb_person_default_on_file_group.index,
    y=cb_person_default_on_file_group,
    text=cb_person_default_on_file_group,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    template="seaborn",
    title='Distribution of cb person default on file',
)
fig.show()
# Number of rows for each loan_status value, shown as a labelled bar chart
loan_status_group = df.groupby('loan_status')['loan_status'].count()
fig = px.bar(
    x=loan_status_group.index,
    y=loan_status_group.values,
    text=loan_status_group.values,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    template="seaborn",
    title='Distribution of loan status',
)
fig.show()
# Boxplot of every numeric column, split by loan grade
for i in col:
    print( i ," Distribution by Grade")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='loan_grade', y=i)
    # Name the plotted column in the title; it used to read 'Loan Amount'
    # for every column in the loop.
    plot.title(f'{i} Distribution by Grade')
    plot.show()
    print("#"*100)
person_age Distribution by Grade
#################################################################################################### person_income Distribution by Grade
#################################################################################################### person_emp_length Distribution by Grade
#################################################################################################### loan_amnt Distribution by Grade
#################################################################################################### loan_int_rate Distribution by Grade
#################################################################################################### loan_percent_income Distribution by Grade
#################################################################################################### cb_person_cred_hist_length Distribution by Grade
#################################################################################################### loan_status Distribution by Grade
####################################################################################################
# Boxplot of every numeric column, split by loan intent
for i in col:
    print( i ," Distribution by loan intent")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='loan_intent', y=i)
    # Name the plotted column in the title; it used to read 'Loan Amount'
    # for every column in the loop.
    plot.title(f'{i} Distribution by loan intent')
    plot.show()
    print("#"*100)
person_age Distribution by loan intent
#################################################################################################### person_income Distribution by loan intent
#################################################################################################### person_emp_length Distribution by loan intent
#################################################################################################### loan_amnt Distribution by loan intent
#################################################################################################### loan_int_rate Distribution by loan intent
#################################################################################################### loan_percent_income Distribution by loan intent
#################################################################################################### cb_person_cred_hist_length Distribution by loan intent
#################################################################################################### loan_status Distribution by loan intent
####################################################################################################
# Boxplot of every numeric column, split by home ownership
for i in col:
    print( i ," Distribution by person home ownership")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='person_home_ownership', y=i)
    # Name the plotted column in the title; it used to read 'Loan Amount'
    # for every column in the loop.
    plot.title(f'{i} Distribution by person home ownership')
    plot.show()
    print("#"*100)
person_age Distribution by person home ownership
#################################################################################################### person_income Distribution by person home ownership
#################################################################################################### person_emp_length Distribution by person home ownership
#################################################################################################### loan_amnt Distribution by person home ownership
#################################################################################################### loan_int_rate Distribution by person home ownership
#################################################################################################### loan_percent_income Distribution by person home ownership
#################################################################################################### cb_person_cred_hist_length Distribution by person home ownership
#################################################################################################### loan_status Distribution by person home ownership
####################################################################################################
# Boxplot of every numeric column, split by the default-on-file flag
for i in col:
    print( i ," Distribution by cb person default on file")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='cb_person_default_on_file', y=i)
    # Name the plotted column in the title; it used to read 'Loan Amount'
    # for every column in the loop.
    plot.title(f'{i} Distribution by cb person default on file')
    plot.show()
    print("#"*100)
person_age Distribution by cb person default on file
#################################################################################################### person_income Distribution by cb person default on file
#################################################################################################### person_emp_length Distribution by cb person default on file
#################################################################################################### loan_amnt Distribution by cb person default on file
#################################################################################################### loan_int_rate Distribution by cb person default on file
#################################################################################################### loan_percent_income Distribution by cb person default on file
#################################################################################################### cb_person_cred_hist_length Distribution by cb person default on file
#################################################################################################### loan_status Distribution by cb person default on file
####################################################################################################
# Pairwise scatter/histogram grid of the DataFrame's columns
sb.pairplot(df)
# Title the figure that pairplot just made current, slightly above the grid
plot.gcf().suptitle('Pairplot of Numerical Features', y=1.02)
plot.show()
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
# Annotated correlation heatmap of the numeric columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
plot.figure(figsize=(12, 8))
sb.heatmap(
    df[numerical_cols].corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plot.title('Correlation Heatmap')
plot.show()
# One 30-bin histogram per numeric column
df.hist(figsize=(15, 10), bins=30)
plot.gcf().suptitle('Histograms of Numerical Features')
plot.show()
# Count the rows (loans) recorded for each distinct age
loan_count_by_age = df.groupby('person_age')['person_income'].count().reset_index()
# Rename the columns for clarity
loan_count_by_age.columns = ['person_age', 'loan_count']
# Bucket the ages. The last bin is open-ended: person_age reaches 123 in this
# data, and with an upper edge of 100 those rows fell outside every bin,
# became NaN and were silently dropped from the totals.
loan_count_by_age['age_group'] = pd.cut(loan_count_by_age['person_age'],
                                        bins=[0, 30, 50, np.inf],
                                        labels=['Young Adult', 'Middle Age', 'Senior'])
# Sum the loan counts per age group. observed=False keeps every category in
# the result and makes the grouping behaviour explicit for categorical keys.
loan_count_by_age = loan_count_by_age.groupby('age_group', observed=False)['loan_count'].sum().reset_index()
# Display the loan counts by age group
loan_count_by_age
# Labelled bar chart of loan counts per age group
fig = px.bar(
    x=loan_count_by_age['age_group'],
    y=loan_count_by_age['loan_count'],
    text=loan_count_by_age['loan_count'],
    color_discrete_sequence=px.colors.qualitative.Dark24,
    template="seaborn",
    title='Distribution of loan_count by age_group',
)
fig.show()
object_columns = df.select_dtypes(include='object').columns
numeric_columns =df.select_dtypes(include=['int', 'float64']).columns

# Category counts split by loan status, with categories in sorted order
for column in object_columns:
    plot.figure(figsize=(12, 5))
    order = sorted(df[column].unique())
    sb.countplot(data=df, x=column,hue='loan_status',order=order)
    plot.title(f'Countplot of {column}')
    plot.tight_layout()
    plot.show()

# Distribution of each numeric feature for each loan status
for column in numeric_columns:
    # numeric_columns also contains the target itself; a boxplot of
    # loan_status against loan_status carries no information, so skip it.
    if column == 'loan_status':
        continue
    sb.boxplot(x='loan_status', y=column, data=df)
    plot.xlabel('Loan Status')
    plot.ylabel(column)
    plot.title('Box Plot of {} vs Loan Status'.format(column))
    plot.show()
# Fixed colour for each loan_status value
viridis_colors = {0: '#440154', 1: '#3B528B'}
# One panel per loan_status, each a loan_amnt histogram with a KDE overlay
g = sb.FacetGrid(
    df,
    col="loan_status",
    hue="loan_status",
    palette=viridis_colors,
    height=5,
    aspect=1.5,
)
g.map(sb.histplot, 'loan_amnt', kde=True)
g.add_legend()
# Leave room at the top of the figure for the overall title
plot.subplots_adjust(top=0.85)
g.fig.suptitle('Loan Amount Distribution by Loan Status')
plot.show()
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
# Fixed colour for each loan_status value
viridis_colors = {0: '#440154', 1: '#3B528B'}
# One panel per loan_status, each a person_income histogram with a KDE overlay
g = sb.FacetGrid(
    df,
    col="loan_status",
    hue="loan_status",
    palette=viridis_colors,
    height=5,
    aspect=1.5,
)
g.map(sb.histplot, 'person_income', kde=True)
g.add_legend()
# Leave room at the top of the figure for the overall title
plot.subplots_adjust(top=0.85)
g.fig.suptitle('Person Income Distribution by Loan Status')
plot.show()
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight